Data Cleaning & Preparing

Loading Libraries

library(lubridate)
library(ggplot2)
library(tidyverse)
library(data.table)
library(lubridate)
library(stringr)
library(ggplot2)
library(plotly)

Data Loading

  features_data_set <- read.csv2("Features data set.csv", header = TRUE, sep = ",")
  sales_data_set <- read.csv2("sales data-set.csv", header = TRUE, sep = ",")
  stores_data_set <- read.csv2("stores data-set.csv", header = TRUE, sep = ",")
  
  str(features_data_set)
## 'data.frame':    8190 obs. of  12 variables:
##  $ Store       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Date        : Factor w/ 182 levels "01/02/2013","01/03/2013",..: 25 67 109 151 26 68 110 152 8 50 ...
##  $ Temperature : Factor w/ 4178 levels "-2.06","-6.08",..: 1066 864 934 1308 1300 1998 1797 1601 2315 2579 ...
##  $ Fuel_Price  : Factor w/ 1011 levels "2.472","2.513",..: 16 10 3 12 43 58 88 95 87 121 ...
##  $ MarkDown1   : Factor w/ 4023 levels "-16.93","-2781.45",..: NA NA NA NA NA NA NA NA NA NA ...
##  $ MarkDown2   : Factor w/ 2715 levels "-0.01","-0.05",..: NA NA NA NA NA NA NA NA NA NA ...
##  $ MarkDown3   : Factor w/ 2885 levels "-0.2","-0.73",..: NA NA NA NA NA NA NA NA NA NA ...
##  $ MarkDown4   : Factor w/ 3405 levels "0.22","0.41",..: NA NA NA NA NA NA NA NA NA NA ...
##  $ MarkDown5   : Factor w/ 4045 levels "-185.17","-37.02",..: NA NA NA NA NA NA NA NA NA NA ...
##  $ CPI         : Factor w/ 2505 levels "126.064","126.0766452",..: 1124 1143 1148 1150 1154 1157 1139 1115 1096 1084 ...
##  $ Unemployment: Factor w/ 404 levels "10.064","10.115",..: 286 286 286 286 286 286 286 286 252 252 ...
##  $ IsHoliday   : logi  FALSE TRUE FALSE FALSE FALSE FALSE ...
  str(sales_data_set)
## 'data.frame':    421570 obs. of  5 variables:
##  $ Store       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Dept        : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Date        : Factor w/ 143 levels "01/04/2011","01/06/2012",..: 20 53 86 119 21 54 87 120 6 39 ...
##  $ Weekly_Sales: Factor w/ 359464 levels "-0.02","-0.04",..: 140272 237401 220814 101950 120302 114885 122532 148007 272542 225936 ...
##  $ IsHoliday   : logi  FALSE TRUE FALSE FALSE FALSE FALSE ...
  str(stores_data_set)
## 'data.frame':    45 obs. of  3 variables:
##  $ Store: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Type : Factor w/ 3 levels "A","B","C": 1 1 2 1 2 1 2 1 2 2 ...
##  $ Size : int  151315 202307 37392 205863 34875 202505 70713 155078 125833 126512 ...

Data type converting

features_data_set$Year <- substr(features_data_set$Date, 7, 10)
features_data_set$Month <- substr(features_data_set$Date, 4, 5)
features_data_set$Day <- substr(features_data_set$Date, 1, 2)

sales_data_set$Year <- substr(sales_data_set$Date, 7, 10)
sales_data_set$Month <- substr(sales_data_set$Date, 4, 5)
sales_data_set$Day <- substr(sales_data_set$Date, 1, 2)

sales_data_set$Weekly_Sales <- as.character(sales_data_set$Weekly_Sales)
sales_data_set$Weekly_Sales <- as.numeric(sales_data_set$Weekly_Sales,2)

Looking at the store numbers

 ggplot(stores_data_set, aes(Type, fill = Type ) ) +
  geom_bar() +
  xlab("Type of Store") + ylab("Count of Store")

Looking at the Sales of the Years

YearSales <- sales_data_set %>% group_by(Year) %>% summarise(YearSales = sum(Weekly_Sales))


ggplot(YearSales, aes(Year, YearSales)) +
        geom_col()

Analyzing the store sizes

SalesStore <- left_join(sales_data_set, stores_data_set, by = "Store")

ggplot(SalesStore, aes(Type, Size) ,log = "xy") +
  geom_point()

Looking at the relationship between Store Sizes & Weekly Sales

plot(SalesStore$Size,SalesStore$Weekly_Sales, main = "Size vs Sales", xlab = "Store Size", ylab = "Weekly Sales")

Clustering of the stores according to the montly sales

Looking at the percentages of the montly sales by the stores

Sales<-data.table(sales_data_set)
Features<-data.table(features_data_set)
Stores<-data.table(stores_data_set)

Sales<-Sales[,list(Store,Dept,Date,Weekly_Sales)]

setkey(Sales,Store,Date)
setkey(Features,Store,Date)

Sales<-Features[Sales]

setkey(Sales,Store)
setkey(Stores,Store)

Sales<-Stores[Sales]

str(Sales)
## Classes 'data.table' and 'data.frame':   421570 obs. of  19 variables:
##  $ Store       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Type        : Factor w/ 3 levels "A","B","C": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Size        : int  151315 151315 151315 151315 151315 151315 151315 151315 151315 151315 ...
##  $ Date        : Factor w/ 143 levels "01/04/2011","01/06/2012",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Temperature : Factor w/ 4178 levels "-2.06","-6.08",..: 2096 2096 2096 2096 2096 2096 2096 2096 2096 2096 ...
##  $ Fuel_Price  : Factor w/ 1011 levels "2.472","2.513",..: 558 558 558 558 558 558 558 558 558 558 ...
##  $ MarkDown1   : Factor w/ 4023 levels "-16.93","-2781.45",..: NA NA NA NA NA NA NA NA NA NA ...
##  $ MarkDown2   : Factor w/ 2715 levels "-0.01","-0.05",..: NA NA NA NA NA NA NA NA NA NA ...
##  $ MarkDown3   : Factor w/ 2885 levels "-0.2","-0.73",..: NA NA NA NA NA NA NA NA NA NA ...
##  $ MarkDown4   : Factor w/ 3405 levels "0.22","0.41",..: NA NA NA NA NA NA NA NA NA NA ...
##  $ MarkDown5   : Factor w/ 4045 levels "-185.17","-37.02",..: NA NA NA NA NA NA NA NA NA NA ...
##  $ CPI         : Factor w/ 2505 levels "126.064","126.0766452",..: 1524 1524 1524 1524 1524 1524 1524 1524 1524 1524 ...
##  $ Unemployment: Factor w/ 404 levels "10.064","10.115",..: 238 238 238 238 238 238 238 238 238 238 ...
##  $ IsHoliday   : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ Year        : chr  "2011" "2011" "2011" "2011" ...
##  $ Month       : chr  "04" "04" "04" "04" ...
##  $ Day         : chr  "01" "01" "01" "01" ...
##  $ Dept        : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Weekly_Sales: num  20398 46992 8734 34452 23599 ...
##  - attr(*, "sorted")= chr "Store"
##  - attr(*, ".internal.selfref")=<externalptr>
summary(Sales)
##      Store      Type            Size                Date       
##  Min.   : 1.0   A:215478   Min.   : 34875   23/12/2011:  3027  
##  1st Qu.:11.0   B:163495   1st Qu.: 93638   25/11/2011:  3021  
##  Median :22.0   C: 42597   Median :140167   16/12/2011:  3013  
##  Mean   :22.2              Mean   :136728   09/12/2011:  3010  
##  3rd Qu.:33.0              3rd Qu.:202505   17/02/2012:  3007  
##  Max.   :45.0              Max.   :219622   30/12/2011:  3003  
##                                             (Other)   :403489  
##   Temperature       Fuel_Price       MarkDown1        MarkDown2     
##  50.43  :   709   3.638  :  2548   1.5    :   102   1.91   :   539  
##  67.87  :   646   3.63   :  2164   460.73 :   102   3      :   493  
##  72.62  :   594   2.771  :  1917   175.64 :    93   0.5    :   485  
##  76.67  :   583   3.891  :  1856   1282.42:    75   1.5    :   471  
##  70.28  :   563   3.594  :  1796   1483.17:    75   4      :   367  
##  76.03  :   555   3.524  :  1793   (Other):150234   (Other):108893  
##  (Other):417920   (Other):409496   NA's   :270889   NA's   :310322  
##    MarkDown3        MarkDown4         MarkDown5               CPI        
##  3      :   754   9      :   280   2743.18 :   136   129.8555333:   711  
##  6      :   710   4      :   200   1064.56 :   120   131.1083333:   708  
##  2      :   660   2      :   197   20371.02:    75   129.8459667:   707  
##  1      :   611   3      :   146   3557.67 :    75   130.3849032:   706  
##  0.22   :   487   47     :   143   3567.03 :    75   130.6457931:   706  
##  (Other):133869   (Other):134001   (Other) :150951   130.683    :   706  
##  NA's   :284479   NA's   :286603   NA's    :270138   (Other)    :417326  
##   Unemployment    IsHoliday           Year              Month          
##  8.099  :  5152   Mode :logical   Length:421570      Length:421570     
##  8.163  :  3636   FALSE:391909    Class :character   Class :character  
##  7.852  :  3614   TRUE :29661     Mode  :character   Mode  :character  
##  7.343  :  3416                                                        
##  7.057  :  3414                                                        
##  7.931  :  3400                                                        
##  (Other):398938                                                        
##      Day                 Dept        Weekly_Sales   
##  Length:421570      Min.   : 1.00   Min.   : -4989  
##  Class :character   1st Qu.:18.00   1st Qu.:  2080  
##  Mode  :character   Median :37.00   Median :  7612  
##                     Mean   :44.26   Mean   : 15981  
##                     3rd Qu.:74.00   3rd Qu.: 20206  
##                     Max.   :99.00   Max.   :693099  
## 
Sales[,Month:=as.numeric(substring(as.character(Date),4,5))]

Sales$Date<-dmy(Sales$Date)

MonthlySales<-Sales[,sum(Weekly_Sales,na.rm = TRUE),.(Store,Month)]

setnames(MonthlySales,"V1","Monthly_Sales")

MonthlySales[,TotalSales:=sum(Monthly_Sales,na.rm = TRUE),.(Month)]

MonthlySales[,SalesPercantage:=Monthly_Sales*1.0/TotalSales]

Clusno<-5

CM=dcast.data.table(MonthlySales,Store~Month,value.var="SalesPercantage")

S<-colnames(CM)
CM<-data.frame(CM)
CM[is.na(CM)]=0
colnames(CM)<-S
CM<-data.table(CM)

# basl<-which(colnames(rr)=="2")
# bitis<-which(colnames(rr)=="269")
set.seed(7)
CM[,clusno:=kmeans(CM[,c(2:ncol(CM)),with=F],Clusno)$cluster]

clusters<-CM[,list(Store,clusno)]

setkey(clusters,Store)

setkey(MonthlySales,Store)

MonthlySales<-clusters[MonthlySales]

SalesP<-dcast.data.table(MonthlySales,Month~Store,value.var="SalesPercantage")

MonthlySales$Month <- factor(MonthlySales$Month)
MonthlySales$Store <- factor(MonthlySales$Store)
MonthlySales$clusno <- factor(MonthlySales$clusno)

# plotting reference lines across each facet:

referenceLines <- MonthlySales  # \/ Rename
colnames(referenceLines)[2] <- "groupVar"
zp <- ggplot(MonthlySales,
             aes(x = Month, y = SalesPercantage))
zp <- zp + geom_line(data = referenceLines,  # Plotting the "underlayer"
                     aes(x = Month, y = SalesPercantage, group = groupVar),
                     colour = "GRAY", alpha = 1/2, size = 1/2)
zp <- zp + geom_line(size = 1)  # Drawing the "overlayer"
zp <- zp + facet_wrap(~ Store)
zp <- zp + theme_bw()
ggplotly()
ggplot(MonthlySales, aes(x=Month, y=SalesPercantage, color=clusno, group=Store)) +
  geom_line()

ggplotly()

###References #_Retail Data Analytics. (2017, August). Retrieved from https://www.kaggle.com/manjeetsingh/retaildataset_